package cerebrum.crawler;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * 图片爬虫
 */
public final class CrawlerUtil {

    /**
     * Image-URL pattern, compiled once instead of on every call.
     * Fixes the original character class {@code [/|.|\w|\s|-]}, which wrongly
     * accepted literal '|' and whitespace inside URLs and did not require "//"
     * after the scheme.
     */
    private static final Pattern IMAGE_URL_PATTERN =
            Pattern.compile("https?://[\\w./-]+\\.(?:jpg|jpeg|png)");

    /** Desktop-browser User-Agent header; some sites reject default Java clients. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0";

    /** Utility class — not instantiable. */
    private CrawlerUtil() {
    }

    /**
     * Fetches the given URL over HTTP(S) and parses the response body as an HTML document.
     *
     * @param url the absolute URL to fetch
     * @return the parsed HTML {@link Document}
     * @throws IOException if the connection fails or the response cannot be read
     */
    public static Document webCrawlerHTTP(String url) throws IOException {
        Connection connection = Jsoup.connect(url)
                // Spoof a browser request header so the server treats us like a normal client.
                .header("User-Agent", USER_AGENT)
                // Accept any content type so non-HTML responses do not throw before parsing.
                .ignoreContentType(true)
                .method(Connection.Method.GET);
        return connection.execute().parse();
    }

    /**
     * Extracts all image URLs (jpg/jpeg/png) found in the given text.
     *
     * @param source raw HTML or text to scan; may be empty
     * @return the matched image URLs in order of appearance; empty list if none
     */
    public static List<String> getImages(String source) {
        List<String> images = new ArrayList<>();
        Matcher matcher = IMAGE_URL_PATTERN.matcher(source);
        while (matcher.find()) {
            images.add(matcher.group());
        }
        return images;
    }
}
